Linear Regression

This is a house-price prediction project using a linear regression model.

First, we install and load all of the packages required for the analysis.

# Install only the packages that are missing, then attach them all.
# Attach order is preserved from the original script so that dplyr is
# loaded last and its select() masks the other packages' versions.
pkgs <- c("readr", "ggplot2", "corrplot", "mlbench", "Amelia",
          "plotly", "reshape2", "caret", "caTools", "dplyr")
missing_pkgs <- pkgs[!vapply(pkgs, requireNamespace, logical(1),
                             quietly = TRUE)]
if (length(missing_pkgs) > 0) {
  install.packages(missing_pkgs)
}
invisible(lapply(pkgs, library, character.only = TRUE))

We load the cleaned dataset.

# Load the Boston housing data shipped with the mlbench package.
# NOTE: the dataset is named `BostonHousing`, not `Housing` — the original
# data(Housing) call produced "data set 'Housing' not found", leaving
# `Housing` undefined for the assignment below.
data(BostonHousing)
housing <- BostonHousing

Exploratory Data Analysis

Visualizations

Correlation

# Correlation matrix of the housing variables; chas is dropped
# (presumably non-numeric here — cor() requires numeric input; confirm).
housing %>%
  select(-chas) %>%
  cor() %>%
  corrplot()

Density Plot using ggplot2

# Density of the target variable, medv.
ggplot(housing, aes(x = medv)) +
  stat_density() +
  theme_bw()

Density Plot using plotly

# Interactive (plotly) version of the medv density plot.
density_plot <- ggplot(housing, aes(x = medv)) +
  stat_density() +
  theme_bw()
ggplotly(density_plot)

medv vs. each predictor

# Scatter each predictor against medv with a smoother, one facet per
# predictor, to eyeball which variables look linearly related to price.
plot_vars <- c("crim", "rm", "age", "rad", "tax", "lstat", "medv",
               "indus", "nox", "ptratio", "zn")
housing %>%
  select(all_of(plot_vars)) %>%
  melt(id.vars = "medv") %>%
  ggplot(aes(x = value, y = medv, colour = variable)) +
  geom_point(alpha = 0.7) +
  stat_smooth(aes(colour = "black")) +
  facet_wrap(~variable, scales = "free", ncol = 2) +
  labs(x = "Variable Value", y = "Median House Price ($1000s)") +
  theme_minimal()
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  pseudoinverse used at -0.5
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  neighborhood radius 13
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  reciprocal condition number  4.5194e-15
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,  :
  There are other near singularities as well. 156.25
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)),  :
  pseudoinverse used at -0.5
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)),  :
  neighborhood radius 13
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)),  :
  reciprocal condition number  4.5194e-15
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)),  :
  There are other near singularities as well. 156.25

Model Building & Prediction

Train and Test Data

# Reproducible 75/25 train/test split.
set.seed(123)
# caTools::sample.split() expects the *outcome vector*, not the whole
# data frame — the original passed `housing`, which does not produce a
# per-row split of the intended ratio. Split on the response instead.
split <- sample.split(housing$medv, SplitRatio = 0.75)
train <- subset(housing, split == TRUE)
test <- subset(housing, split == FALSE)

Training The Model

# Fit an OLS regression of median house price on four predictors
# (crime rate, rooms, tax rate, lower-status %) chosen from the EDA above.
model <- lm(medv ~ crim + rm + tax + lstat , data = train)
# Coefficients, significance codes, R-squared — printed output below.
summary(model)

Call:
lm(formula = medv ~ crim + rm + tax + lstat, data = train)

Residuals:
    Min      1Q  Median 
-16.849  -3.267  -1.009 
     3Q     Max 
  2.147  29.690 

Coefficients:
             Estimate
(Intercept) -4.320521
crim        -0.077099
rm           5.608630
tax         -0.004938
lstat       -0.494856
            Std. Error
(Intercept)   3.588604
crim          0.036376
rm            0.495616
tax           0.002070
lstat         0.058225
            t value Pr(>|t|)
(Intercept)  -1.204   0.2294
crim         -2.120   0.0347
rm           11.316  < 2e-16
tax          -2.386   0.0176
lstat        -8.499 5.27e-16
               
(Intercept)    
crim        *  
rm          ***
tax         *  
lstat       ***
---
Signif. codes:  
  0 ‘***’ 0.001 ‘**’ 0.01
  ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 5.203 on 357 degrees of freedom
Multiple R-squared:  0.6754,    Adjusted R-squared:  0.6718 
F-statistic: 185.7 on 4 and 357 DF,  p-value: < 2.2e-16

Visualizing The Model

# Histogram of model residuals; roughly symmetric mass around 0 supports
# the linear-model assumptions.
res <- residuals(model)
res <- as.data.frame(res)
# bins = 30 is ggplot2's default for geom_histogram; stating it explicitly
# produces the identical plot while silencing the "Pick better value with
# `binwidth`" message.
ggplot(res, aes(res)) + geom_histogram(bins = 30, fill = 'blue', alpha = 0.5)
`stat_bin()` using `bins =
30`. Pick better value with
`binwidth`.

# Standard lm diagnostic plots: residuals vs fitted, normal Q-Q,
# scale-location, and residuals vs leverage.
plot(model)

Predictions

# Predict medv on the held-out test set and plot predicted vs actual.
test$predicted.medv <- predict(model, test)

actual_vs_pred <- ggplot(test, aes(x = medv, y = predicted.medv)) +
  geom_point(alpha = 0.5) +
  stat_smooth(aes(colour = 'black')) +
  xlab('Actual value of medv') +
  ylab('Predicted value of medv') +
  theme_bw()

# Interactive version of the scatter plot.
ggplotly(actual_vs_pred)
`geom_smooth()` using method = 'loess' and formula 'y ~ x'

Assessing our Model

# Prediction error on the test set.
error <- test$medv - test$predicted.medv
# RMSE is the square root of the *mean of squared* errors.
# The original sqrt(mean(error)^2) computed |mean(error)| — the absolute
# mean bias, which is near zero for any unbiased model — not RMSE.
rmse <- sqrt(mean(error^2))
LS0tCnRpdGxlOiAiQ1NFNDAyNyAtIEhvdXNlIFByaWNpbmciCm91dHB1dDoKICBodG1sX25vdGVib29rOiBkZWZhdWx0CiAgcGRmX2RvY3VtZW50OiBkZWZhdWx0CiAgd29yZF9kb2N1bWVudDogZGVmYXVsdAotLS0KCgojIExpbmVhciBSZWdyZXNzaW9uClRoaXMgaXMgYSBIb3VzZSBQcmljZSBQcmVkaWN0aW9uIFVzaW5nIExpbmVhciBSZWdyZXNzaW9uICBNb2RlbAoKQXQgZmlyc3Qgd2UgaW5zdGFsbCBhbGwgdGhlIHJlcXVpcmVkIHBhY2thZ2VzIGZvciB0aGUgbGluZWFyIHJlZ3Jlc3Npb24uCmBgYHtyfQppbnN0YWxsLnBhY2thZ2VzKCdyZWFkcicpCmluc3RhbGwucGFja2FnZXMoJ2dncGxvdDInKQppbnN0YWxsLnBhY2thZ2VzKCdtbGJlbmNoJykKaW5zdGFsbC5wYWNrYWdlcygnY29ycnBsb3QnKQppbnN0YWxsLnBhY2thZ2VzKCdBbWVsaWEnKQppbnN0YWxsLnBhY2thZ2VzKCdjYXJldCcpCmluc3RhbGwucGFja2FnZXMoJ3Bsb3RseScpCmluc3RhbGwucGFja2FnZXMoJ2NhVG9vbHMnKQppbnN0YWxsLnBhY2thZ2VzKCdyZXNoYXBlMicpCmluc3RhbGwucGFja2FnZXMoJ2RwbHlyJykKbGlicmFyeShyZWFkcikKbGlicmFyeShnZ3Bsb3QyKQpsaWJyYXJ5KGNvcnJwbG90KQpsaWJyYXJ5KG1sYmVuY2gpCmxpYnJhcnkoQW1lbGlhKQpsaWJyYXJ5KHBsb3RseSkKbGlicmFyeShyZXNoYXBlMikKbGlicmFyeShjYXJldCkKbGlicmFyeShjYVRvb2xzKQpsaWJyYXJ5KGRwbHlyKQoKYGBgCgpXZSBpbnB1dCB0aGUgY2xlYW5lZCBkYXRhc2V0CmBgYHtyfQpkYXRhKEhvdXNpbmcpCmhvdXNpbmcgPC0gSG91c2luZwpgYGAKCiMjIEV4cGxvcmF0b3J5IERhdGEgQW5hbHlzaXMKIyMjIFZpc3VhbGl6YXRpb25zCgojIyMjIENvcnJlbGF0aW9uCgpgYGB7cn0KY29ycnBsb3QoY29yKHNlbGVjdChob3VzaW5nLC1jaGFzKSkpCmBgYAoKIyMjIyBEZW5zaXR5IFBsb3QgdXNpbmcgZ2dwbG90MgpgYGB7cn0KaG91c2luZyAlPiUgCiAgZ2dwbG90KGFlcyhtZWR2KSkgKwogIHN0YXRfZGVuc2l0eSgpICsgCiAgdGhlbWVfYncoKQpgYGAKCiMjIyMgRGVuc2l0eSBQbG90IHVzaW5nIHBsb3RseQpgYGB7cn0KZ2dwbG90bHkoaG91c2luZyAlPiUgCiAgZ2dwbG90KGFlcyhtZWR2KSkgKwogIHN0YXRfZGVuc2l0eSgpICsgCiAgdGhlbWVfYncoKSkKYGBgCgojIyMjIG1lZHYKYGBge3J9CmhvdXNpbmcgJT4lCiAgc2VsZWN0KGMoY3JpbSwgcm0sIGFnZSwgcmFkLCB0YXgsIGxzdGF0LCBtZWR2LGluZHVzLG5veCxwdHJhdGlvLHpuKSkgJT4lCiAgbWVsdChpZC52YXJzID0gIm1lZHYiKSAlPiUKICBnZ3Bsb3QoYWVzKHggPSB2YWx1ZSwgeSA9IG1lZHYsIGNvbG91ciA9IHZhcmlhYmxlKSkgKwogIGdlb21fcG9pbnQoYWxwaGEgPSAwLjcpICsKICBzdGF0X3Ntb290aChhZXMoY29sb3VyID0gImJsYWNrIikpICsKICBmYWNldF93cmFwKH52YXJpYWJsZSwgc2NhbGVzID0gImZyZWUiLCBuY29sID0gMikgKwogIGxhYnMoeCA9ICJWYXJpYWJsZSBWYWx1ZSIsIHkgPSAi
TWVkaWFuIEhvdXNlIFByaWNlICgkMTAwMHMpIikgKwogIHRoZW1lX21pbmltYWwoKQpgYGAKCgojIyBNb2RlbCBCdWlsZGluZyAmIFByZWRpY3Rpb24KIyMjIFRyYWluIGFuZCBUZXN0IERhdGEKYGBge3J9CnNldC5zZWVkKDEyMykKc3BsaXQgPC0gc2FtcGxlLnNwbGl0KGhvdXNpbmcsU3BsaXRSYXRpbyA9MC43NSkKdHJhaW4gPC0gc3Vic2V0KGhvdXNpbmcsc3BsaXQ9PVRSVUUpCnRlc3QgPC0gc3Vic2V0KGhvdXNpbmcsc3BsaXQ9PUZBTFNFKQpgYGAKCiMjIyBUcmFpbmluZyBUaGUgTW9kZWwKYGBge3J9Cm1vZGVsIDwtIGxtKG1lZHYgfiBjcmltICsgcm0gKyB0YXggKyBsc3RhdCAsIGRhdGEgPSB0cmFpbikKc3VtbWFyeShtb2RlbCkKYGBgCiMjIyBWaXN1YWxpemluZyBUaGUgTW9kZWwKYGBge3J9CnJlcyA8LSByZXNpZHVhbHMobW9kZWwpCnJlcyA8LSBhcy5kYXRhLmZyYW1lKHJlcykKZ2dwbG90KHJlcyxhZXMocmVzKSkgKyAgZ2VvbV9oaXN0b2dyYW0oZmlsbD0nYmx1ZScsYWxwaGE9MC41KQpgYGAKCmBgYHtyfQpwbG90KG1vZGVsKQpgYGAKCgojIyMgUHJlZGljdGlvbnMKYGBge3J9CnRlc3QkcHJlZGljdGVkLm1lZHYgPC0gcHJlZGljdChtb2RlbCx0ZXN0KQoKcGwxIDwtdGVzdCAlPiUgCiAgZ2dwbG90KGFlcyhtZWR2LHByZWRpY3RlZC5tZWR2KSkgKwogIGdlb21fcG9pbnQoYWxwaGE9MC41KSArIAogIHN0YXRfc21vb3RoKGFlcyhjb2xvdXI9J2JsYWNrJykpICsKICB4bGFiKCdBY3R1YWwgdmFsdWUgb2YgbWVkdicpICsKICB5bGFiKCdQcmVkaWN0ZWQgdmFsdWUgb2YgbWVkdicpKwogIHRoZW1lX2J3KCkKCmdncGxvdGx5KHBsMSkKYGBgCgojIyMgQXNzZXNzaW5nIG91ciBNb2RlbApgYGB7cn0KZXJyb3IgPC0gdGVzdCRtZWR2LXRlc3QkcHJlZGljdGVkLm1lZHYKcm1zZSA8LSBzcXJ0KG1lYW4oZXJyb3IpXjIpCmBgYAo=